# Author: Mark Richardson
# Date started: 10/13/2022
# Date finished: 11/04/2022
# Purpose: Estimate performance ratings using partisan models
# Proofed 11/05/2023

# Date revised: 02/21/2024
# Revision purpose: Log R output

# Open a log
library(logr)

lf <- log_open(file_name = "03_performance_ratings_estimation_pid_models.log")

log_code()

# Load packages
library(dplyr)
library(ggplot2)

library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

library(stringr)

# Load data
load("data/ratings/performance_ratings/01_performance_ratings_for_model.RData")

#### Code appointee variable ####

perf_ratings <- perf_ratings %>%
  mutate(appointee = if_else(app == "appointee", 1, 0))

#### Create a ratings-level data frame ####

perf_stan <- list()

for (i in 1:5) {
  
  var_num <- paste0("_", i)
  
  perf_stan[[i]] <- perf_ratings %>%
    select(id:office, pid_3:ideology, appointee, contains(var_num)) %>%
    rename("agency_rated" = paste0("agency_rated", var_num),
           "perf_rating_org" = paste0("perf_rating", var_num, "_org"),
           "perf_rating" = paste0("perf_rating", var_num))
  
}

perf_stan <- bind_rows(perf_stan) %>%
  tidyr::drop_na(perf_rating) # Drop rows with no rating

rm(i, var_num)

# Code partisanship variables

table(perf_ratings$pid_probe, perf_ratings$pid_3, useNA = "always")

perf_stan <- perf_stan %>%
  mutate(pid_lean = case_when(pid_probe == "Closer to the Democratic Party" ~ "Lean Dem",
                              pid_probe == "Neither" ~ "Ind",
                              pid_probe == "Closer to the Republican Party" ~ "Lean Rep",
                              pid_3 == "Democrat" ~ "Dem",
                              pid_3 == "Republican" ~ "Rep",
                              pid_3 == "Independent" & (pid_probe == "Don't know" | pid_probe == "Refused" | is.na(pid_probe)) ~ "Ind")) %>%
  mutate(pid_plus = case_when(pid_lean == "Lean Dem" | pid_lean == "Dem" ~ "Dem",
                              pid_lean == "Ind" ~ "Ind",
                              pid_lean == "Lean Rep" | pid_lean == "Rep" ~ "Rep")) %>%
  mutate(rep = if_else(pid_plus == "Rep", 1, 0),
         dem = if_else(pid_plus == "Dem", 1, 0))


# Drop raters with no pid response

perf_stan <- perf_stan %>%
  filter(!is.na(rep))

table(is.na(perf_stan$rep), is.na(perf_stan$dem))

#### Wrangle the ratings ####

# Calculate ratings per agency
perf_stan <- perf_stan %>%
  group_by(agency_rated) %>%
  mutate(n_ratings_agency = n()) %>%
  ungroup()

table(perf_stan$n_ratings_agency)

# Calculate number of ratings per rater
perf_stan <- perf_stan %>%
  group_by(id) %>%
  mutate(n_ratings_rater = n()) %>%
  ungroup()

table(perf_stan$n_ratings_rater)

#### Prepare data for stan ####

# Create a dept index and get raters per dept

dept_data <- perf_stan %>%
  distinct(dept, id) %>%
  group_by(dept) %>%
  mutate(n_raters_dept = n()) %>%
  ungroup() %>%
  distinct(dept, n_raters_dept) %>%
  arrange(dept) %>%
  mutate(dept_index = 1:n())

# Create an agency index

agency_data <- perf_stan %>%
  distinct(agency_rated) %>%
  arrange(agency_rated) %>%
  mutate(agency_index = 1:n())

# Create a rater_index

rater_data <- perf_stan %>%
  distinct(id) %>%
  mutate(rater_index = 1:n())

# Join indices with ratings-level data

n <- nrow(perf_stan)

perf_stan <- perf_stan %>%
  full_join(agency_data, by = "agency_rated") %>%
  full_join(rater_data, by = "id") %>%
  full_join(dept_data, by = "dept")

nrow(perf_stan) == n # No rows added

rm(n)

# Get rater and dept index combinations for non-centered parameterization

dept_rater_grp <- perf_stan %>%
  select(rater_index, dept_index) %>%
  distinct() %>%
  arrange(rater_index)

# Prepare informed ratings

agency_data <- agency_data %>%
  left_join(perf_inf_priors, by = "agency_rated") %>%
  rename(n_inf_ratings = n)

# Set informed prior for cases with few ratings
# Set standard deviation for cases with 5 or fewer informed ratings
# to max observed standard deviation

agency_data <- agency_data %>%
  mutate(perf_inf_se = sqrt(perf_inf_var / n_inf_ratings),
         perf_inf_sd = sqrt(perf_inf_var),
         perf_inf_mean_stan = if_else(is.na(perf_inf_mean), 0, perf_inf_mean),
         perf_inf_sd_stan = if_else(n_inf_ratings <= 5 | is.na(perf_inf_sd),
                                    max(perf_inf_sd, na.rm = TRUE), perf_inf_sd))

#### Assemble the data for Stan ####

for_stan_pid <- list(y         = perf_stan$perf_rating, # Vector of ratings
                     N         = length(perf_stan$perf_rating), # Number of ratings
                     aa        = perf_stan$agency_index, # Vector of agency indexes
                     A         = length(unique(perf_stan$agency_index)), # Number of agencies
                     rr        = perf_stan$rater_index, # Rater index
                     R         = length(unique(perf_stan$rater_index)), # Number of raters
                     dd        = dept_rater_grp$dept_index, # Dept index (for mapping raters to departments)
                     D         = length(unique(perf_stan$dept_index)), # Number of departments
                     mu_prior  = agency_data %>% arrange(agency_index) %>% pull(perf_inf_mean_stan), # Vector of informed means
                     sd_prior  = agency_data %>% arrange(agency_index) %>% pull(perf_inf_sd_stan), # Vector of informed standard deviations
                     sd_naive  = 1.5, # Standard deviation for the naive prior
                     rep       = perf_stan$rep, # Rep indicator
                     dem       = perf_stan$dem, # Dem indicator
                     appointee = perf_stan$appointee) # Appointee indicator

#### Fit the models ####

#### Informed hierarchical model ####

perf_inf_hier_base <- stan(file = "code/ratings/stan_models/ratings_model_informed_sd_hierarchical_multi.stan",
                           data = for_stan_pid,
                           chains = 5,
                           warmup = 1000,
                           iter = 4000,
                           cores = 5,
                           refresh = 1000,
                           init = list(chain_1 = list(x = rep(-3, times = for_stan_pid$A)),
                                       chain_2 = list(x = rep(-2, times = for_stan_pid$A)),
                                       chain_3 = list(x = rep( 0, times = for_stan_pid$A)),
                                       chain_4 = list(x = rep( 2, times = for_stan_pid$A)),
                                       chain_5 = list(x = rep( 3, times = for_stan_pid$A))),
                           seed = 31031617,
                           control = list(adapt_delta = 0.99,
                                          max_treedepth = 20,
                                          stepsize = 0.001),
                           diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_informed_sd_hier_pid_base.csv")

get_elapsed_time(perf_inf_hier_base) / 60^2

check_treedepth(perf_inf_hier_base)
check_energy(perf_inf_hier_base)
check_divergences(perf_inf_hier_base)

save.image(file = "data/ratings/performance_ratings/saves_during_estimation/perf_in_process_pid.RData")

#### Naive hierarchical model ####

perf_naive_hier_base <- stan(file = "code/ratings/stan_models/ratings_model_naive_hierarchical_multi.stan",
                             data = for_stan_pid,
                             chains = 5,
                             warmup = 1000,
                             iter = 4000,
                             cores = 5,
                             refresh = 1000,
                             init = list(chain_1 = list(x = rep(-3, times = for_stan_pid$A)),
                                         chain_2 = list(x = rep(-2, times = for_stan_pid$A)),
                                         chain_3 = list(x = rep( 0, times = for_stan_pid$A)),
                                         chain_4 = list(x = rep( 2, times = for_stan_pid$A)),
                                         chain_5 = list(x = rep( 3, times = for_stan_pid$A))),
                             seed = 31031617,
                             control = list(adapt_delta = 0.99,
                                            max_treedepth = 20,
                                            stepsize = 0.001),
                             diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_naive_sd_hier_pid_base.csv")

get_elapsed_time(perf_naive_hier_base) / 60^2

check_treedepth(perf_naive_hier_base)
check_energy(perf_naive_hier_base)
check_divergences(perf_naive_hier_base)

save.image(file = "data/ratings/performance_ratings/saves_during_estimation/perf_in_process_pid.RData")

#### Partisan models ####

#### Informed hierarchical model - partisan ####

perf_inf_hier_pid <- stan(file = "code/ratings/stan_models/partisan_performance_models/ratings_model_informed_sd_hierarchical_multi_pid.stan",
                          data = for_stan_pid,
                          chains = 5,
                          warmup = 1000,
                          iter = 4000,
                          cores = 5,
                          refresh = 1000,
                          init = list(chain_1 = list(x = rep(-3, times = for_stan_pid$A)),
                                      chain_2 = list(x = rep(-2, times = for_stan_pid$A)),
                                      chain_3 = list(x = rep( 0, times = for_stan_pid$A)),
                                      chain_4 = list(x = rep( 2, times = for_stan_pid$A)),
                                      chain_5 = list(x = rep( 3, times = for_stan_pid$A))),
                          seed = 31031617,
                          control = list(adapt_delta = 0.99,
                                         max_treedepth = 20,
                                         stepsize = 0.001),
                          diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_informed_sd_hier_pid.csv")

get_elapsed_time(perf_inf_hier_pid) / 60^2

check_treedepth(perf_inf_hier_pid)
check_energy(perf_inf_hier_pid)
check_divergences(perf_inf_hier_pid)

save.image(file = "data/ratings/performance_ratings/saves_during_estimation/perf_in_process_pid.RData")

#### Naive hierarchical model - partisan ####

perf_naive_hier_pid <- stan(file = "code/ratings/stan_models/partisan_performance_models/ratings_model_naive_sd_hierarchical_multi_pid.stan",
                            data = for_stan_pid,
                            chains = 5,
                            warmup = 1000,
                            iter = 4000,
                            cores = 5,
                            refresh = 1000,
                            init = list(chain_1 = list(x = rep(-3, times = for_stan_pid$A)),
                                        chain_2 = list(x = rep(-2, times = for_stan_pid$A)),
                                        chain_3 = list(x = rep( 0, times = for_stan_pid$A)),
                                        chain_4 = list(x = rep( 2, times = for_stan_pid$A)),
                                        chain_5 = list(x = rep( 3, times = for_stan_pid$A))),
                            seed = 31031617,
                            control = list(adapt_delta = 0.99,
                                           max_treedepth = 20,
                                           stepsize = 0.001),
                            diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_naive_sd_hier_pid.csv")

get_elapsed_time(perf_naive_hier_pid) / 60^2

check_treedepth(perf_naive_hier_pid)
check_energy(perf_naive_hier_pid)
check_divergences(perf_naive_hier_pid)

save.image(file = "data/ratings/performance_ratings/saves_during_estimation/perf_in_process_pid.RData")

#### Model with separate ratings for Dems and Reps ####

# Subset to only dems and reps and get ratings per agency

perf_stan_pid <- perf_stan %>%
  filter( (dem == 1 | rep == 1) ) %>%
  group_by(agency_rated) %>%
  mutate(n_dem_ratings_agency = sum(dem, na.rm = TRUE),
         n_rep_ratings_agency = sum(rep, na.rm = TRUE)) %>%
  ungroup()

table(perf_stan_pid %>% distinct(agency_rated, n_dem_ratings_agency) %>% pull(n_dem_ratings_agency)) # 6 have zero

table(perf_stan_pid %>% distinct(agency_rated, n_rep_ratings_agency) %>% pull(n_rep_ratings_agency)) # 30 have zero

# Subset to agencies with at least one rating per party and create indicator variable for fixed agencies

perf_stan_pid <- perf_stan_pid %>%
  filter(n_dem_ratings_agency > 0 & n_rep_ratings_agency > 0)

# Create indicator variable for fixed agencies and agency index

agency_data_pid <- perf_stan_pid %>%
  distinct(agency_rated, n_dem_ratings_agency, n_rep_ratings_agency) %>%
  mutate(fixed = if_else(agency_rated == "Office of Personnel Management" |
                           agency_rated == "Office of Management and Budget" |
                           agency_rated == "Coast Guard", 1, 0)) %>%
  group_by(fixed) %>%
  arrange(agency_rated) %>%
  mutate(agency_index = 1:n()) %>% # Creates index within groups
  ungroup()

# Create a dept index and get raters per dept

dept_data_pid <- perf_stan_pid %>%
  distinct(dept, id) %>%
  group_by(dept) %>%
  mutate(n_raters_dept = n()) %>%
  ungroup() %>%
  distinct(dept, n_raters_dept) %>%
  arrange(dept) %>%
  mutate(dept_index = 1:n())

# Rater index
rater_data_pid <- perf_stan_pid %>%
  distinct(id) %>%
  arrange(id) %>%
  mutate(rater_index = 1:n())

# Join indices with ratings-level data

n <- nrow(perf_stan_pid)

perf_stan_pid <-  perf_stan_pid %>%
  select(!c(rater_index, agency_index, dept_index, n_ratings_agency, n_raters_dept))

perf_stan_pid <- perf_stan_pid %>%
  full_join(agency_data_pid %>% select(agency_rated, agency_index, fixed), by = "agency_rated") %>%
  full_join(rater_data_pid, by = "id") %>%
  full_join(dept_data_pid, by = "dept")

nrow(perf_stan_pid) == n # No rows added

rm(n)

# Get rater and dept index combinations for non-centered parameterization

dept_rater_grp_pid <- perf_stan_pid %>%
  select(rater_index, dept_index) %>%
  distinct() %>%
  arrange(rater_index)

# Prepare data for Stan

for_stan_pid_sep <- list(y        = perf_stan_pid$perf_rating, # Vector of ratings
                         N        = length(perf_stan_pid$perf_rating), # Number of ratings
                         aa       = perf_stan_pid$agency_index, # Vector of agency indexes
                         A_fixed  = sum(agency_data_pid$fixed == TRUE), # Number of agencies with fixed performance
                         A_pid    = sum(agency_data_pid$fixed == FALSE), # Number of agencies with partisan performance
                         rr       = perf_stan_pid$rater_index, # Rater index
                         R        = length(unique(perf_stan_pid$rater_index)), # Number of raters
                         dd       = dept_rater_grp_pid$dept_index, # Dept index (for mapping raters to departments)
                         D        = length(unique(dept_rater_grp_pid$dept_index)), # Number of departments
                         sd_naive = 1.5, # Standard deviation for naive prior
                         dem      = perf_stan_pid$dem, # Dem indicator
                         fixed    = perf_stan_pid$fixed) # Fixed across parties indicator variable

#### Fit the partisan sep model - dept hierarchy ####

perf_pid_sep <- stan(file = "code/ratings/stan_models/partisan_performance_models/ratings_model_naive_hierarchical_multi_pid_separate.stan",
                     data = for_stan_pid_sep,
                     chains = 5,
                     warmup = 1000,
                     iter = 4000,
                     cores = 5,
                     refresh = 100,
                     init = list(chain_1 = list(x_fixed = rep(-3, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep(-3, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep(-3, times = for_stan_pid_sep$A_pid)),
                                 chain_2 = list(x_fixed = rep(-2, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep(-2, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep(-2, times = for_stan_pid_sep$A_pid)),
                                 chain_3 = list(x_fixed = rep( 0, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep( 0, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep( 0, times = for_stan_pid_sep$A_pid)),
                                 chain_4 = list(x_fixed = rep( 2, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep( 2, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep( 2, times = for_stan_pid_sep$A_pid)),
                                 chain_5 = list(x_fixed = rep( 3, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep( 3, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep( 3, times = for_stan_pid_sep$A_pid))),
                       seed = 31031617,
                       control = list(adapt_delta = 0.975,
                                      max_treedepth = 20,
                                      stepsize = 0.001),
                       diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_naive_pid_sep.csv")

get_elapsed_time(perf_pid_sep) / 60^2

check_treedepth(perf_pid_sep)
check_energy(perf_pid_sep)
check_divergences(perf_pid_sep)

save.image(file = "data/ratings/performance_ratings/saves_during_estimation/perf_in_process_pid.RData")

#### Fit the partisan sep model - party hierarchy ####

# Get rater party mapping for non-centered parameterization

rater_grp_pid <- perf_stan_pid %>%
  select(rater_index, dem) %>%
  distinct() %>%
  mutate(dem = dem + 1) %>%
  arrange(rater_index)

# Prepare data for Stan

for_stan_pid_sep_pty_grp <- list(y        = perf_stan_pid$perf_rating, # Vector of ratings
                                 N        = length(perf_stan_pid$perf_rating), # Number of ratings
                                 aa       = perf_stan_pid$agency_index, # Vector of agency indexes
                                 A_fixed  = sum(agency_data_pid$fixed == TRUE), # Number of agencies with fixed performance
                                 A_pid    = sum(agency_data_pid$fixed == FALSE), # Number of agencies with partisan performance
                                 rr       = perf_stan_pid$rater_index, # Rater index
                                 R        = length(unique(perf_stan_pid$rater_index)), # Number of raters
                                 dd       = rater_grp_pid$dem, # Party index (for mapping raters to parties)
                                 D        = length(unique(rater_grp_pid$dem)), # Number of parties
                                 sd_naive = 1.5, # Standard deviation for naive prior
                                 dem      = perf_stan_pid$dem, # Dem indicator
                                 fixed    = perf_stan_pid$fixed) # Fixed across parties indicator variable


perf_pid_sep_pty_grp <- stan(file = "code/ratings/stan_models/partisan_performance_models/ratings_model_naive_hierarchical_multi_pid_separate.stan",
                     data = for_stan_pid_sep_pty_grp,
                     chains = 5,
                     warmup = 1000,
                     iter = 4000,
                     cores = 5,
                     refresh = 100,
                     init = list(chain_1 = list(x_fixed = rep(-3, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep(-3, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep(-3, times = for_stan_pid_sep$A_pid)),
                                 chain_2 = list(x_fixed = rep(-2, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep(-2, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep(-2, times = for_stan_pid_sep$A_pid)),
                                 chain_3 = list(x_fixed = rep( 0, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep( 0, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep( 0, times = for_stan_pid_sep$A_pid)),
                                 chain_4 = list(x_fixed = rep( 2, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep( 2, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep( 2, times = for_stan_pid_sep$A_pid)),
                                 chain_5 = list(x_fixed = rep( 3, times = for_stan_pid_sep$A_fixed),
                                                x_dem   = rep( 3, times = for_stan_pid_sep$A_pid),
                                                x_rep   = rep( 3, times = for_stan_pid_sep$A_pid))),
                     seed = 31031617,
                     control = list(adapt_delta = 0.975,
                                    max_treedepth = 20,
                                    stepsize = 0.001),
                     diagnostic_file = "data/ratings/performance_ratings/diagnostics/perf_diag_naive_pid_sep_pty_grp.csv")

get_elapsed_time(perf_pid_sep_pty_grp) / 60^2

check_treedepth(perf_pid_sep_pty_grp)
check_energy(perf_pid_sep_pty_grp)
check_divergences(perf_pid_sep_pty_grp)

save.image(file = "data/ratings/performance_ratings/saves_during_estimation/perf_in_process_pid.RData")

hyp <- summary(perf_pid_sep_pty_grp, pars = c("mu_beta", "mu_alpha"))$summary

#### Save fitted models ####

save.image("data/ratings/performance_ratings/03_performance_fitted_models_pid.RData")

log_close()